# The purpose of this script is to investigate the bot results from 02b.  It will generate histograms by country as well as a table of likely bots by country.  For Supplementary Materials.
###########
#
# GLOBALS
#
###########
#setwd('/path/to/replication')

library(plyr)  # round_any.  has to go before dplyr for namespace issues
library(dplyr)
library(stargazer)
library(ggplot2)


# Some of the data will have country level stuff, need to drop.
# Is from Script 5, the regression model script
# Drop observations that should not enter the city-level analysis.
#
# Arguments:
#   data        - data frame with (at least) `city_use` and `country` columns.
#   thesecities - optional vector of city names to keep.  When NULL (the
#                 default) no city subsetting is done and only the bad
#                 observations listed below are removed.
#
# Returns: the rows of `data` that survive the filters, columns unchanged.
#
# Rows whose `country` is NA are kept as they are; the filters only remove
# rows that positively match an excluded country or city.
makeShortData <- function(data, thesecities=NULL){
  # Fail loudly: with a missing column the comparisons below would have
  # length zero and the function would silently return an empty data frame.
  missing_cols <- setdiff(c('city_use', 'country'), names(data))
  if (length(missing_cols) > 0) {
    stop('makeShortData: data is missing column(s): ',
         paste(missing_cols, collapse=', '), call.=FALSE)
  }

  short <- data

  # Subset cities (only when a list of cities was supplied)
  if (!is.null(thesecities)) {
    short <- short[short$city_use %in% thesecities, , drop=FALSE]
  }

  # Get rid of bad observations
  bad_countries <- c(
    'EG',  # Had wrong dates for Egypt
    'RU'   # Cross-section, not temporal
  )
  bad_cities <- c(
    'Guayabal',          # Country level for Venezuela
    'Puerto Carreno',    # Country level for Venezuela as well
    'Baykit',            # Russia country level
    'Dobrovelychkivka',  # Ukraine country level
    'Pir Mahal',         # Pakistan country level
    'Asilah',            # A town in Morocco, not sure how it got here
    'Smilavichy'         # Centroid for Belarus, so country level.
  )

  # %in% never returns NA, so a row with a missing value is not turned into
  # an all-NA row the way `short[short$country != 'EG',]` would do.
  keep <- !(short$country %in% bad_countries) & !(short$city_use %in% bad_cities)

  return(short[keep, , drop=FALSE])
}


###########
#
# DATA
#
###########
data <- read.csv('./Data/02_processedData/c2_DonghyeonAlexmerged_classifiers_ShortSpain_dedupDetect.csv', stringsAsFactors=FALSE)

# Because of tweet rot, some images could not be downloaded and tested, so
# they have no deduplicate_id.  Therefore, should drop.
data <- data[!is.na(data$deduplicate_id),]

data$day <- substr(data$hour, 1, 10)
data$weekday <- weekdays(as.Date(data$day))


# Need to add country like I did earlier for main data.  This has to happen
# BEFORE makeShortData(), because that function filters on data$country.
data$country <- data$place.country_code

# Strip the b'..' wrapper around the country code (e.g. b'ES' -> ES)
data$country <- gsub("b'", "", data$country)
data$country <- gsub("'", "", data$country)


# Drop the bad / country-level observations.  makeShortData() also subsets
# to a list of cities; passing every city present in the data makes that
# subset a no-op here, so all remaining cities are available to the
# city-level summary further down.
data <- makeShortData(data, thesecities=unique(data$city_use))


### Get table of number of times image id duplicated
## 1 = not duplicated
dup_counts <- data.frame(table(data$deduplicate_id))
names(dup_counts) <- c('deduplicate_id', 'Duplicate_Count')

## Merge back into main data
data <- merge(data, dup_counts, by.x='deduplicate_id', by.y='deduplicate_id', all.x=TRUE, sort=FALSE)

# 1 when the image id occurs more than once in the data, 0 otherwise
data$isdup <- ifelse(data$Duplicate_Count > 1, 1, 0)


# First occurrence of each image id vs. every later occurrence
originals <- data[!duplicated(data$deduplicate_id),]
duplicates <- data[duplicated(data$deduplicate_id),]

###########
#
# WORK
#
###########
### Figure A22
# Pooled histogram of the base-10 log of Duplicate_Count, one value per row of data.
jpeg('./Figures/DuplicatesHistogram_pooled.jpeg')
hist(
  log(data$Duplicate_Count, base=10),
  breaks=100,
  main='',
  xlab='Log(Number of Duplicates per Image)',
  xlim=c(0,2.75)
)
dev.off()


### City-level summary (input to Table A14)
### Share of duplicated rows and total tweets per city, not sorted by country
cities <- data %>%
  group_by(city_use) %>%
  summarize(dup_rate=mean(isdup), tweets=sum(tweets)) %>%
  data.frame()

# Inspect the summary ordered by duplicate rate, then by tweet volume.
cities[order(cities[['dup_rate']]), ]
cities[order(cities[['tweets']]), ]  # Author's note: interesting, the top ones are the big cities - Caracas, Barcelona, Seoul, Tsuen Wan.

# Cities used in the paper.  List taken from replication_05_MainWork.Rmd.
thesecities <- c(
  "Barcelona", "Ciutat Vella", "Girona", "Granera", "Granollers", "Lleida",
  "Mataró", "Reus", "Sabadell", "Sant Cugat del Vallès",
  "Sant Feliu de Pallerols", "Sant Salvador de Guardiola", "Tarragona",
  "Terrassa", "Central", "Kowloon", "Seoul", "Kimhae", "Lahore", "Caracas",
  "Valencia", "Caucagua", "Boca del Rio", "Maracaibo"
)

# Table A14: paper cities only, highest duplicate rate first
papercities <- cities[cities[['city_use']] %in% thesecities, ]
stargazer(
  papercities[order(papercities[['dup_rate']], decreasing=TRUE), ],
  summary=FALSE,
  out='./Tables/deduplication_table_byCity.tex'
)




